Feature Interpretation For Breed Classifier
Imports¶
In [1]:
# Notebook config: auto-reload edited modules and render matplotlib plots inline
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from fastai import *
from fastai.vision import *
In [3]:
np.random.seed(2)
Data¶
In [4]:
path = untar_data(URLs.PETS)
In [5]:
path
Out[5]:
In [6]:
path.ls()
Out[6]:
In [7]:
# Dataset layout: an annotations/ and an images/ subfolder
path_anno = path/'annotations'
path_img = path/'images'
In [8]:
# Collect all image file paths; the breed label is encoded in each filename
fnames = get_image_files(path_img)
fnames[:5]
Out[8]:
Load data¶
In [9]:
pat = r'/([^/]+)_\d+.jpg$'
In [10]:
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224)
In [11]:
# Normalize each RGB channel with the ImageNet statistics the pretrained
# backbone was trained with (same mean and sd per channel).
# Motivation: control for variation in camera brightness/contrast.
data.normalize(imagenet_stats)
In [393]:
data.show_batch(rows=3, figsize=(10,10))
In [13]:
# List the breed classes; data.c is the class count used to size the model head
print(data.classes)
len(data.classes),data.c
Out[13]:
Train¶
Final Layer Only¶
In [14]:
learn = ConvLearner(data, models.resnet34, metrics=error_rate)
In [ ]:
# LR range test: sweep learning rates and plot loss vs lr to pick max_lr
learn.lr_find()
learn.recorder.plot()
In [16]:
# Recreate the learner (discarding the mini-training done by lr_find), then
# train the head for 4 one-cycle epochs at the lr suggested by the plot above
learn = ConvLearner(data, models.resnet34, metrics=error_rate)
learn.fit_one_cycle(4,max_lr=3*10**-3)
Previous Layers Also¶
In [ ]:
# unfreeze all layers so the pretrained backbone can be fine-tuned too
learn.unfreeze()
# re-run the LR range test for the fully unfrozen network
learn.lr_find()
learn.recorder.plot()
In [22]:
#train previous layers also
learn.fit_one_cycle(2, max_lr=slice(1e-6,1e-4))
Extract features from the last hidden layer¶
In [23]:
# this is a hook (learned about it here: https://forums.fast.ai/t/how-to-find-similar-images-based-on-final-embedding-layer/16903/13)
# Forward-hook helper (pattern from
# https://forums.fast.ai/t/how-to-find-similar-images-based-on-final-embedding-layer/16903/13)
class SaveFeatures():
    """Forward hook that accumulates a module's outputs across forward passes.

    Registers on module `m`; after every forward pass, the (detached, CPU,
    numpy) output rows are appended to `self.features`. Call `remove()` when
    done to detach the hook.
    """
    def __init__(self, m):
        # fire hook_fn after every forward pass of m
        self.hook = m.register_forward_hook(self.hook_fn)
        self.features = None  # lazily initialized on the first pass

    def hook_fn(self, module, input, output):
        out = output.detach().cpu().numpy()
        # first pass initializes; later passes are stacked row-wise
        # (np.vstack replaces the deprecated np.row_stack alias)
        self.features = out if self.features is None else np.vstack((self.features, out))

    def remove(self):
        """Detach the hook so no further outputs are accumulated."""
        self.hook.remove()
In [50]:
# Switch the model to eval mode (disables dropout) so extracted features are
# deterministic
learn.model.eval()
# Get validation features.
# Hook the last real layer of the head — the final two are a dropout layer and
# the prediction layer — so sf accumulates the penultimate embeddings.
sf = SaveFeatures(learn.model[-1][-3])
# Run every validation image through the model one at a time; the forward hook
# stores each embedding as a side effect (the prediction itself is discarded).
# TODO: refactor — the DataLoader-of-one dance is copied from fastai internals
n_valid = len(data.valid_ds.ds.y)
for i in range(n_valid):
    # get image
    img, label = data.valid_dl.dl.dataset[i]
    # apply the validation transforms (no augmentation)
    img = apply_tfms(learn.data.valid_ds.tfms, img, **learn.data.valid_ds.kwargs)
    # lines adapted from fastai.vision.learner._predict: wrap the single image
    # in a batch-of-one DataLoader on the learner's device
    ds = TensorDataset(img.data[None], torch.zeros(1))
    dl = DeviceDataLoader.create(ds, bs=1, shuffle=False, device=learn.data.device, tfms=learn.data.valid_dl.tfms,
                                 num_workers=0)
    pred = learn.model(dl.one_batch()[0])
    # coarse progress indicator
    if i % 1000 == 0:
        print(f'{i/n_valid*100:.2f}% ready')
# features accumulated by the hook: one row per validation image
X_valid = sf.features
# store labels
y_valid = data.valid_ds.ds.y
# FIX: detach the hook — the original left it registered, so it kept stacking
# rows (wasting memory) during every later forward pass
sf.remove()
In [ ]:
# Get training-set features from the same hooked layer (last real layer before
# the head's final dropout + prediction layers)
sf = SaveFeatures(learn.model[-1][-3])
# TODO: refactor — this cell duplicates the validation-extraction cell above
n_train = len(data.train_ds.ds.y)
for i in range(n_train):
    # get image
    img, label = data.train_dl.dl.dataset[i]
    # NOTE(review): this deliberately(?) applies the *validation* transforms so
    # training images are not augmented during feature extraction — confirm
    img = apply_tfms(learn.data.valid_ds.tfms, img, **learn.data.valid_ds.kwargs)
    # lines adapted from fastai.vision.learner._predict: batch-of-one DataLoader
    ds = TensorDataset(img.data[None], torch.zeros(1))
    dl = DeviceDataLoader.create(ds, bs=1, shuffle=False, device=learn.data.device, tfms=learn.data.valid_dl.tfms,
                                 num_workers=0)
    pred = learn.model(dl.one_batch()[0])
    # coarse progress indicator
    if i % 1000 == 0:
        print(f'{i/n_train*100:.2f}% ready')
# features accumulated by the hook: one row per training image
X_train = sf.features
# store labels
y_train = data.train_ds.ds.y
# FIX: detach the hook so it stops accumulating rows on later forward passes
sf.remove()
PCA: make the features linearly independent and sort them by how much of the variation between observations they capture¶
In [93]:
# Project the embeddings onto their principal components: decorrelates the
# features and orders them by how much variance each explains
from sklearn.decomposition import PCA
n_components = X_train.shape[1]  # keep every component (no dimensionality reduction)
pca = PCA(n_components)
# fit on training data only, to avoid leakage if the PCA features feed a classifier
PCA_X_train = pca.fit_transform(X_train)
PCA_X_valid = pca.transform(X_valid)
In [94]:
# Train an L1-regularized logistic regression on the PCA features and sweep the
# inverse regularization strength C; the L1 penalty zeroes out unimportant
# features, revealing which ones matter for classifying specific breeds.
# FIX: LogisticRegression had no visible import in this notebook.
from sklearn.linear_model import LogisticRegression

Cs = np.logspace(-4, 4, 10)
results = pd.DataFrame(index=Cs, columns=['train_score', 'valid_score'])
for C in Cs:
    # FIX: pin solver='liblinear' — it was sklearn's historical default and
    # supports the L1 penalty; modern defaults (lbfgs) reject penalty='l1'
    clf = LogisticRegression(penalty='l1', C=C, solver='liblinear').fit(PCA_X_train, y_train)
    results.loc[C, 'train_score'] = clf.score(PCA_X_train, y_train)
    results.loc[C, 'valid_score'] = clf.score(PCA_X_valid, y_valid)
In [116]:
# inspect the C sweep; pick the C with the best valid_score
results
Out[116]:
In [117]:
# Refit with the C that scored best on the validation set in the sweep above.
# FIX: pin solver='liblinear' (historical sklearn default; supports penalty='l1',
# which modern default solvers reject)
clf = LogisticRegression(penalty='l1', C=0.36, solver='liblinear').fit(PCA_X_train, y_train)
print(clf.score(PCA_X_valid, y_valid))
In [130]:
coefs = clf.coef_
In [133]:
# Aggregate importance of each PCA feature: sum of absolute weights across classes.
# FIX: the display line was misspelled `feautre_importances`, which raises NameError
feature_importances = np.abs(coefs).sum(axis=0)
feature_importances
Explore features¶
In [135]:
# For every PCA feature, rank the validation images by their score on that
# feature, highest score first
sorts = {i: np.argsort(PCA_X_valid[:, i])[::-1] for i in range(n_components)}
In [302]:
# function to plot a list of image ids from the validation set in a grid
def show_valid(id_list, n_cols=5):
    """Display the validation-set images with the given ids.

    id_list : indices into the validation dataset
    n_cols  : images per row (rows are ceil(len(id_list)/n_cols))
    """
    n_rows = int(np.ceil(len(id_list) / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10))
    # FIX: the original indexed id_list for every axis, raising IndexError when
    # len(id_list) is not a multiple of n_cols; spare axes are now hidden.
    # np.atleast_1d also guards the single-Axes return of plt.subplots(1, 1).
    for i, ax in enumerate(np.atleast_1d(axs).flatten()):
        if i < len(id_list):
            img, label = data.valid_dl.dl.dataset[id_list[i]]
            show_image(img, ax=ax)
        else:
            ax.axis('off')
    return plt.tight_layout()
In [389]:
# Show the validation images scoring lowest / highest on PCA feature 0
f=0
print(f'Pets with LEAST / MOST of feature {f}')
# NOTE(review): [-6:-1] skips the single lowest-scoring image — confirm intended
show_valid(sorts[f][-6:-1])
show_valid(sorts[f][0:5])
In [388]:
# Show the validation images scoring lowest / highest on PCA feature 1
f=1
print(f'Pets with LEAST / MOST of feature {f}')
# NOTE(review): [-6:-1] skips the single lowest-scoring image — confirm intended
show_valid(sorts[f][-6:-1])
show_valid(sorts[f][0:5])
In [360]:
# Scale each image's PCA feature vector to unit L2 norm so scores on different
# features are comparable when combined below.
# NOTE(review): sklearn's `normalize` is row-wise unit-norm scaling, NOT the
# mean=0/sd=1 standardization the original comment claimed — that would need
# StandardScaler. Confirm which was intended.
from sklearn.preprocessing import normalize
nPCA_X_valid = normalize(PCA_X_valid)
In [382]:
# Make a dataframe combining the top two PCA features into "quadrant" scores.
# Trick: the square root of a negative value is NaN, and NaNs sort last — so
# each score is effectively defined only for images in the matching quadrant
# (e.g. naked_cat is NaN unless doggyness < 0 AND hairyness < 0).
# NOTE(review): the names assume PC0 ~ dog-vs-cat and PC1 ~ hairiness, based
# on the visual inspection above — unverified.
df = pd.DataFrame({'doggyness':nPCA_X_valid[:,0],
                   'hairyness':nPCA_X_valid[:,1]})
df['naked_cat'] = (-1*df.doggyness)**0.5 + (-1*df.hairyness)**0.5
df['naked_dog'] = (1*df.doggyness)**0.5 + (-1*df.hairyness)**0.5
df['hairy_cat'] = (-1*df.doggyness)**0.5 + (1*df.hairyness)**0.5
df['hairy_dog'] = (1*df.doggyness)**0.5 + (1*df.hairyness)**0.5
In [384]:
# Top-5 images for the 'naked_cat' quadrant score (NaNs sort last).
# TODO: the four near-identical cells vary only `show` — refactor into a loop
show = 'naked_cat'
df = df.sort_values(by=show,ascending=False)
show_valid(df.index[0:5])
In [386]:
# Top-5 images for the 'naked_dog' quadrant score (NaNs sort last)
show = 'naked_dog'
df = df.sort_values(by=show,ascending=False)
show_valid(df.index[0:5])
In [387]:
# Top-5 images for the 'hairy_cat' quadrant score (NaNs sort last)
show = 'hairy_cat'
df = df.sort_values(by=show,ascending=False)
show_valid(df.index[0:5])
In [385]:
# Top-5 images for the 'hairy_dog' quadrant score (NaNs sort last)
show = 'hairy_dog'
df = df.sort_values(by=show,ascending=False)
show_valid(df.index[0:5])